#Authors: Ian McElveen and Cecilia Gonzales
#Author Date: 6/9/25
#Purpose: The purpose of this notebook is to house all data set transformation, cleansing, visualization, statistical analysis, and note-taking for the 2025 CU Athletic Department Sports Science Internship Program
#LAST UPDATED: 6/12/2025
#Including helpful libraries
library(tidyverse)
library(readxl)
library(aod)
library(dplyr)
library(ggplot2)
library(lubridate)
library(boot)
library(ROCR)
library(purrr)
library(ggforce)
library(lme4)
library(cv)
library(caret)
library(rsample)
library(yardstick)
library(knitr)
#Example of importing our first data set from a CSV file to a data frame.
# df <- read.csv("./foo/bar.csv")
#Load in MBB - Statistic Tracking Report csv
stats <- read.csv("data-sets/data-sets-uncompressed/data-sets-compressed/Reactive Strength Index/MBB - Statistic Tracking Report.csv")
# Load in ACWR - Kinexon MBB
kinexon_ACWR <- read.csv("data-sets/data-sets-uncompressed/data-sets-compressed/Reactive Strength Index/ACWR - Kinexon MBB.csv")
#Loading in VALD - Dynamo MBB
VALD_dynamo <- read.csv("data-sets/data-sets-uncompressed/data-sets-compressed/Reactive Strength Index/VALD - Dynamo MBB.csv")
# Load in VALD - ForceDecks MBB csv
VALD_ForceDecks <- read.csv("data-sets/data-sets-uncompressed/data-sets-compressed/Reactive Strength Index/VALD - ForceDecks MBB.csv")
# Load in Kinexon Session
kinexon_session <- read.csv("data-sets/data-sets-uncompressed/data-sets-compressed/Reactive Strength Index/Kinexon Session MBB.csv")
# Remove columns where all data points are NA
stats_clean <- stats |>
select(where(~ !all(is.na(.))))
# If all three date columns are equal, remove the redundant ones
if (all(stats_clean$Date == stats_clean$Date.Reverse, na.rm = TRUE) &&
all(stats_clean$Date == stats_clean$Session.Date, na.rm = TRUE)) {
stats_clean <- stats_clean |>
select(-c(Date.Reverse, Session.Date))
}
# Convert if Date column is NOT already in Date format
if (!inherits(stats_clean$Date, "Date")) {
stats_clean <- stats_clean %>%
mutate(Date = as.Date(Date, format = "%m/%d/%Y"))
}
# Confirm it's now in Date format
class(stats_clean$Date)
[1] "Date"
stats_clean <- stats_clean[!is.na(stats_clean$Plus..Minus),]
# Add more stats
stats_clean <- stats_clean %>%
filter(Date >= as.Date("2024-09-20", '%Y-%m-%d')) %>%
group_by(anon_id) %>%
mutate(Player.Average.Rebounds = mean(na.omit(Rebounds)),
Player.Average.Steals = mean(na.omit(Steals)),
Player.Average.Assists = mean(na.omit(Assists)),
Player.Average.Turnovers = mean(na.omit(Turnovers)),
Player.Average.Points.Scored = mean(na.omit(Points.Scored)),
Player.Median.Rebounds = median(na.omit(Rebounds)),
Player.Median.Steals = median(na.omit(Steals)),
Player.Median.Assists = median(na.omit(Assists)),
Player.Median.Turnovers = median(na.omit(Turnovers)),
Player.Median.Points.Scored = median(na.omit(Points.Scored))) %>%
ungroup() %>%
group_by(Date) %>%
mutate(Team.Rebounds = sum(na.omit(Rebounds)),
Team.Steals = sum(na.omit(Steals)),
Team.Assists = sum(na.omit(Assists)),
Team.Turnovers = sum(na.omit(Turnovers)),
Team.Points.Scored = as.numeric(str_split(Game.Score, "-")[[1]][1])) %>%
ungroup() %>%
mutate(Team.Average.Rebounds = mean(Team.Rebounds),
Team.Average.Steals = mean(Team.Steals),
Team.Average.Assists = mean(Team.Assists),
Team.Average.Turnovers = mean(Team.Turnovers),
Team.Average.Points.Scored = mean(Team.Points.Scored)) %>%
mutate(Player.Good.Game = ifelse(Rebounds >= Player.Median.Rebounds & Steals >= Player.Median.Steals & Assists >= Player.Median.Assists,1,0),
Player.Bad.Game = ifelse(Rebounds < Player.Median.Rebounds & Steals < Player.Median.Steals & Assists < Player.Median.Assists,1,0),
Team.Good.Game = ifelse(Team.Rebounds >= median(Team.Rebounds) & Team.Steals >= median(Team.Steals) & Team.Assists >= median(Team.Assists),1,0),
Team.Bad.Game = ifelse(Team.Rebounds < median(Team.Rebounds) & Team.Steals < median(Team.Steals) & Team.Assists < median(Team.Assists),1,0)) %>%
select(-c(Sport, Day.of.Week, Month, Month.with.Number, Year, Number.of.Competitions.Won, Total.Daily.Competitions.Available, Total.Competitions.Won, Total.Game.Minutes, Offensive.Rebounds, Defensive.Rebounds, Buff.Plays, Percentage.Daily.of.Competitions.Won)) %>%
filter(!(anon_id %in% c("ID_10", "ID_14", "ID_15", "ID_18", "ID_54", "ID_64")))
# Remove columns where all data points are NA
kinexon_ACWR_clean <- kinexon_ACWR |>
select(where(~ !all(is.na(.))))
# If all three date columns are equal, remove the redundant ones
if (all(kinexon_ACWR_clean$Date == kinexon_ACWR_clean$Date.Reverse, na.rm = TRUE) &&
all(kinexon_ACWR_clean$Date == kinexon_ACWR_clean$Session.Date, na.rm = TRUE)) {
kinexon_ACWR_clean <- kinexon_ACWR_clean |>
select(-c(Date.Reverse, Session.Date))
}
# Convert if Date column is NOT already in Date format
if (!inherits(kinexon_ACWR_clean$Date, "Date")) {
kinexon_ACWR_clean <- kinexon_ACWR_clean %>%
mutate(Date = as.Date(Date, format = "%m/%d/%Y"))
}
# Confirm it's now in Date format
class(kinexon_ACWR_clean$Date)
[1] "Date"
# Selecting only relevant columns
kinexon_ACWR_clean <- kinexon_ACWR_clean %>%
select(anon_id, Date, Day.of.the.Week, Month, Year, Position, Session.Type, Session.Description, Practice.Classification, Last.7.Day.Acceleration.Load, Today.s.Acceleration.Load, Acceleration.Load.ACWR, Acceleration.Load.EWMA.ACWR, Acute.Total.Acceleration.Load, Acute.Total.Acceleration.Load.EWMA, Chronic.Total.Acceleration.Load, Chronic.Total.Acceleration.Load.EWMA, Total.Low.Acceleration.Load, Total.Medium.Acceleration.Load, Total.High.Acceleration.Load, Total.Very.High.Acceleration.Load, Last.7.Day.Total.Jump.Load, Today.s.Jump.Load, Total.Jump.Load.ACWR, Acute.Total.Jump.Load, Acute.Total.Jump.Load.EWMA, Chronic.Total.Jump.Load, Chronic.Total.Jump.Load.EWMA, Last.7.Day.Total.Minutes, Today.s.Minutes, Minutes.ACWR, Minutes.EWMA.ACWR, Acute.Minutes, Acute.Minutes.EWMA, Chronic.Minutes, Chronic.Minutes.EWMA, Total.Distance.EWMA.ACWR)
#selecting important variables, removing rows with missing data, only selecting hand movements
VALD_dynamo_clean <- VALD_dynamo %>%
filter(Body.Region == "Hand") %>%
mutate(Right.Side = ifelse(Repetition.Type.Laterality=="RightSide",1,0)) %>%
select(anon_id, Date, Max.Force.Newtons, Avg.Force.Newtons, Max.Impulse.Newton.Seconds, Max.Rate.Of.Force.Development.Newtons.Per.Second, Avg.Rate.Of.Force.Development.Newtons.Per.Second, Min.Time.To.Peak.Force.Seconds, Start.Offset.Seconds, Repetition.Duration.Seconds, Repetition.Max.Force.Newtons, Impulse.Newton.Seconds, Rate.Of.Force.Development.Newtons.Per.Second, Time.To.Peak.Force.Seconds,Right.Side) %>%
na.omit()
# Convert if Date column is NOT already in Date format
if (!inherits(VALD_dynamo_clean$Date, "Date")) {
VALD_dynamo_clean<- VALD_dynamo_clean%>%
mutate(Date = as.Date(Date, format = "%m/%d/%Y"))
}
# Confirm it's now in Date format
class(VALD_dynamo_clean$Date)
[1] "Date"
#This data set contains information from each individual hand in a strength test meaning that there are two measurements per day for each player. This makes it hard to understand in plots especially when plotted like time series data. For future reference, it might be useful to take an average of both sides or note that there is a difference in each hand and plot the left and rights sides separately.
#### Note that you have to group on id and the date in order to get a valid average of both sides for a specific day for each individual player.
VALD_ForceDecks_clean <- VALD_ForceDecks %>%
select(where(~ !all(is.na(.)))) %>%
mutate(X=1:3384) %>%
filter(Test.Type == "DJ") %>%
select(X,anon_id, Date, Week, Position, Test.Type, Weight..kg., Trial, Flight.Time..s., Jump.Height..Flight.Time...cm., Eccentric.Concentric.Mean.Force.Ratio, Contact.Time..s., RSI..JH..Flight.Time..Contact.Time., RSI..Flight.Time.Contact.Time., Drop.Height..cm.)%>%
mutate(RSI..m.per.s. = (Jump.Height..Flight.Time...cm./100)/Contact.Time..s.,
Date = as.Date(Date, '%m/%d/%Y')) %>%
group_by(Date, anon_id) %>%
mutate(Weight..kg. = mean(na.omit(Weight..kg.)),
Player.Average.RSI..m.per.s. = mean(RSI..m.per.s.)) %>%
ungroup() %>%
group_by(Date) %>%
mutate(Team.Average.RSI..m.per.s. = mean(na.omit(RSI..m.per.s.)))
# Remove columns where all data points are NA
kinexon_session_clean <- kinexon_session |>
select(where(~ !all(is.na(.))))
# Convert if Date column is NOT already in Date format
if (!inherits(kinexon_session_clean$Date, "Date")) {
kinexon_session_clean <- kinexon_session_clean %>%
mutate(Date = as.Date(Date, format = "%m/%d/%Y"))
}
# Confirm it's now in Date format
class(kinexon_session_clean$Date)
[1] "Date"
# Filter to only include data from this season
kinexon_session_clean <- kinexon_session_clean |>
filter(Date >= as.Date("2024-6-01"))
##Analysis
# Some basic visualizations
VALD_ForceDecks_clean <- VALD_ForceDecks_clean |>
filter(Date >= as.Date("2024-10-10"))
# Boxplot showing RSI distribution for each athlete
ggplot(VALD_ForceDecks_clean, aes(anon_id, RSI..m.per.s., color = anon_id)) +
geom_boxplot() +
labs(title = "RSI Variability by Athlete") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Line and scatter plot showing RSI trends over time for each athlete
# Includes a black line for the team average RSI to compare individual performance to
ggplot(VALD_ForceDecks_clean, aes(Date, Player.Average.RSI..m.per.s., color = anon_id)) +
geom_point() +
geom_smooth(method = "lm", se=FALSE) +
labs(title = "Player RSI Over Time with Team Average Line") +
geom_line(aes(y = Team.Average.RSI..m.per.s.),
color = "black")
`geom_smooth()` using formula = 'y ~ x'
ggplot(data=VALD_ForceDecks_clean, aes(x=Date, y=Player.Average.RSI..m.per.s.)) +
geom_point() +
geom_smooth(method="lm")
#taking out all of the IDs in the data set
IDs <- unique(VALD_ForceDecks_clean$anon_id)
#for all unique IDs, calculate Kendall rank correlation, plot player RSI throughout season
for(i in 1:16){
RSI <- unique(VALD_ForceDecks_clean[VALD_ForceDecks_clean$anon_id == IDs[i],]$Player.Average.RSI..m.per.s.)
kendall_cor <- cor(1:length(RSI), RSI, method="kendall") #calculating Kendall correlation
#plotting each player's RSI throughout the season with their correlation and rough model of the trend
p <- ggplot(data = VALD_ForceDecks_clean[VALD_ForceDecks_clean$anon_id == IDs[i],], aes(x=Date, y=Player.Average.RSI..m.per.s.)) +
geom_point() +
geom_smooth(method="lm") +
labs(title = kendall_cor, subtitle = IDs[i])
print(p)
}
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
#for all the players, calculate mean and variance of their RSI measurements and plot
for(i in 1:16){
#for every player calculate mean and variance using time series assumptions and functions from the time series functions file
mv <- sample_mean_and_variance(unique(VALD_ForceDecks_clean[VALD_ForceDecks_clean$anon_id == IDs[i],]$Player.Average.RSI..m.per.s.))
#plotting RSI measurements along with calculated mean and 1 standard deviation above and below mean for each player along with their measured RSIs throughout the season
p <- ggplot(data = VALD_ForceDecks_clean[VALD_ForceDecks_clean$anon_id == IDs[i],], aes(x=Date, y=Player.Average.RSI..m.per.s.)) +
geom_point() +
geom_hline(yintercept = mv[[1]]) +
geom_hline(yintercept = mv[[1]] + sqrt(mv[[2]]), color="#CFB87C") +
geom_hline(yintercept = mv[[1]] - sqrt(mv[[2]]), color="#CFB87C") +
labs(title = "RSI Measurements Throughout Season with Mean and 1 Standard Deviation",subtitle = IDs[i])
#printing out each plot
print(p)
}
VALD_ForceDecks_clean$Player.Above.1.SD <- rep(NA,565)
VALD_ForceDecks_clean$Player.Below.1.SD <- rep(NA,565)
VALD_ForceDecks_clean$Player.Above.2.SD <- rep(NA,565)
VALD_ForceDecks_clean$Player.Below.2.SD <- rep(NA,565)
for(i in 1:16){
mv <- sample_mean_and_variance(unique(VALD_ForceDecks_clean[VALD_ForceDecks_clean$anon_id == IDs[i],]$Player.Average.RSI..m.per.s.))
sdu1 <- mv[[1]] + sqrt(mv[[2]])
sdl1 <- mv[[1]] - sqrt(mv[[2]])
sdu2 <- mv[[1]] + (2*sqrt(mv[[2]]))
sdl2 <- mv[[1]] - (2*sqrt(mv[[2]]))
#checking if RSI value is below 1 standard deviation for each player
VALD_ForceDecks_clean[VALD_ForceDecks_clean$anon_id == IDs[i],]$Player.Below.1.SD <- ifelse((VALD_ForceDecks_clean[VALD_ForceDecks_clean$anon_id == IDs[i],]$Player.Average.RSI..m.per.s.<sdl1), 1, 0)
#checking if RSI value is above 1 standard deviation for each player
VALD_ForceDecks_clean[VALD_ForceDecks_clean$anon_id == IDs[i],]$Player.Above.1.SD <- ifelse((VALD_ForceDecks_clean[VALD_ForceDecks_clean$anon_id == IDs[i],]$Player.Average.RSI..m.per.s.>sdu1), 1, 0)
#checking if RSI value is below 2 standard deviations for each player
VALD_ForceDecks_clean[VALD_ForceDecks_clean$anon_id == IDs[i],]$Player.Below.2.SD <- ifelse((VALD_ForceDecks_clean[VALD_ForceDecks_clean$anon_id == IDs[i],]$Player.Average.RSI..m.per.s.<sdl2), 1, 0)
#checking if RSI value is above 2 standard deviations for each player
VALD_ForceDecks_clean[VALD_ForceDecks_clean$anon_id == IDs[i],]$Player.Above.2.SD <- ifelse((VALD_ForceDecks_clean[VALD_ForceDecks_clean$anon_id == IDs[i],]$Player.Average.RSI..m.per.s.>sdu2), 1, 0)
}
#making a plot to show potentially significant RSI measurements throughout the season
ggplot(data=VALD_ForceDecks_clean, aes(x=Date, Player.Average.RSI..m.per.s.)) +
geom_point(color = ifelse(VALD_ForceDecks_clean$Player.Above.2.SD == 1, "palegreen", ifelse(VALD_ForceDecks_clean$Player.Below.2.SD == 1, "red", ifelse(VALD_ForceDecks_clean$Player.Below.1.SD == 1, "red4", ifelse(VALD_ForceDecks_clean$Player.Above.1.SD == 1, "palegreen4", "grey80")))), alpha = 0.4) +
labs(title = "Significant RSI Measurements Throughout the Season")
This plot shows each player’s measured RSI throughout the season. Points that are colored gray are points where the player did not have a significant difference in their RSI from their mean RSI throughout the season. Dark green dots show RSI measurements that are outside of 1 standard deviation for each player and light green dots are RSI measurements that are outside 2 standard deviations for each player. Along the same lines, dark red dots shown points in which the player had an RSI outside of 1 standard deviation from their RSI distribution and light red dots show points in which the player’s measured RSI is outside of 2 standard deviations of their RSI distribution. One interesting thing about this plot is that there are no measurements that are below even 1 standard deviation for any player after February starts. If anything, there seems to be a higher concentration of RSI measurements that are at least above 1 standard deviation from the mean in the last 5 measurements of the season, after February starts than in the rest of the season.
#cycling through all of the players and plotting their RSI measurements throughout the season
for(i in 1:16){
#plot the measured RSI for each player throughout the season
#dark green means point is above 1 standard deviation from mean
#light green means point is above 2 standard deviations from mean
#dark red means point is below 1 standard deviation from mean
#light red means point is below 2 standard deviations from mean
p <- ggplot(data=VALD_ForceDecks_clean[VALD_ForceDecks_clean$anon_id == IDs[i],], aes(x=Date, Player.Average.RSI..m.per.s.)) +
geom_point(color = ifelse(VALD_ForceDecks_clean[VALD_ForceDecks_clean$anon_id == IDs[i],]$Player.Above.2.SD == 1, "palegreen", ifelse(VALD_ForceDecks_clean[VALD_ForceDecks_clean$anon_id == IDs[i],]$Player.Below.2.SD == 1, "red", ifelse(VALD_ForceDecks_clean[VALD_ForceDecks_clean$anon_id == IDs[i],]$Player.Below.1.SD == 1, "red4", ifelse(VALD_ForceDecks_clean[VALD_ForceDecks_clean$anon_id == IDs[i],]$Player.Above.1.SD == 1, "palegreen4", "grey70"))))) +
ylim(0.7311404, 2.963925) +
labs(title = "Measured RSI Values Throughout the Season and Their Significance")
print(p)
}